import pdfplumber
import pandas as pd
def pdf_to_excel2(file_path, excel_name=None):
    """Extract the table from every page of a PDF and save them as one Excel file.

    The rows of each page's table are stacked in page order and written to
    ``<excel_name>.xlsx`` (relative to the current working directory) without
    the DataFrame index.

    Args:
        file_path: Path of the PDF file to extract tables from.
        excel_name: Name of the output workbook without the ``.xlsx``
            extension. When empty or omitted, the PDF's own file name
            (without directory and extension) is used.
    """
    # Local import keeps this function self-contained; ntpath accepts both
    # '\\' and '/' as separators regardless of the OS the script runs on.
    import ntpath

    if not excel_name:
        excel_name = ntpath.splitext(ntpath.basename(file_path))[0]

    rows = []
    # The context manager closes the PDF even if extraction fails part-way.
    with pdfplumber.open(file_path) as pdf:
        # Iterate over every page, including the last one.
        for page in pdf.pages:
            table = page.extract_table()
            # Pages on which no table was found yield nothing; skip them.
            if table:
                rows.extend(table)

    # Build the frame once from all rows: DataFrame.append no longer exists
    # in pandas 2.x, and growing a frame page by page is needlessly slow.
    pd.DataFrame(rows).to_excel(excel_name + '.xlsx', index=False)


# PDF processed when this file is executed directly. This is a raw string, so
# every backslash (including the doubled one) is kept literally.
read_path = r'D:\Projects\PycharmProjects\pythonProject2\DataA\A103\\task2_3.pdf'

if __name__ == '__main__':
    # Convert the PDF above, naming the output workbook '数据分析'.
    pdf_to_excel2(read_path, '数据分析')

# def read_pdf(read_path, save_path):
#     pdf_2020 = pdfplumber.open(read_path)
#     result_df = pd.DataFrame()
#     for page in pdf_2020.pages:
#         table = page.extract_table()
#         print(table)
#         df_detail = pd.DataFrame(table[1:], columns=table[0])
#         result_df = pd.concat([df_detail, result_df], ignore_index=True)
#     result_df.dropna(axis=1, how='all', inplace=True)
#     result_df.columns = ['排名', '一', '二', '三']
#     result_df.to_excel(excel_writer=save_path, index=False, encoding='utf-8')
#
#
# save_path = r'hhhh.xlsx'
# if __name__ == '__main__':
#     read_pdf(read_path, save_path)
